***************************************
** Title	: 	Data Setup
** Data		:	NIDS				
** Author(s): 	Mo Alloush			
** Date		:	January, 2023			
***************************************

*The raw data are prepared by the NIDS team (University of Cape Town). Download the data from NIDS: it ships with setup do-files that combine the 5 waves and save the combined file under the name used below.
use NIDS_1_2_3_4_5, clear

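*Deflators to express values in constant prices (originally described as 2016 prices). NOTE(review): the reference index below (127.7) equals the April 2017 entry of the wave 5 table, so confirm the intended base period.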

* Reference price-index level; every deflator in this file is base divided by
* the index of the household interview month.
scalar base=127.7 

* Wave 1: index values for interview months 1 to 12, in calendar order.
local pidx_w1 74.8 75.1 76.3 77.7 78.5 79.6 81.2 81.8 81.9 81.9 82.0 81.1

gen w1_deflator=.
forvalues mon = 1/12 {
	local idx : word `mon' of `pidx_w1'
	replace w1_deflator = base/`idx' if w1_h_intrv_m==`mon'
}
* Month code 33 is given the same index as month 3 (76.3).
* NOTE(review): the meaning of code 33 is not visible here - confirm.
replace w1_deflator = base/76.3 if w1_h_intrv_m==33

label variable w1_deflator "Wave 1 deflator"

* Wave 2: the index table covers May 2010 through September 2011, so both the
* interview month and the interview year are matched.
gen w2_deflator=.

* 2010, months 5 to 12
local pidx_2010 88.0 88.0 88.6 88.6 88.7 88.9 89.0 89.2
forvalues mon = 5/12 {
	local pos = `mon' - 4
	local idx : word `pos' of `pidx_2010'
	replace w2_deflator = base/`idx' if w2_h_intrv_m==`mon' & w2_h_intrv_y==2010
}

* 2011, months 1 to 5 and 7 to 9 (the table has no entry for month 6 of 2011)
local mons_2011 1 2 3 4 5 7 8 9
local pidx_2011 89.6 90.2 91.3 91.6 92.0 93.2 93.4 93.8
forvalues k = 1/8 {
	local mon : word `k' of `mons_2011'
	local idx : word `k' of `pidx_2011'
	replace w2_deflator = base/`idx' if w2_h_intrv_m==`mon' & w2_h_intrv_y==2011
}

* Households with outcome code 1 but no recorded interview month get the
* index 88.7 (the same value as month 9 of 2010).
replace w2_deflator = base/88.7 if w2_h_intrv_m==. & w2_h_outcome==1

label variable w2_deflator "Wave 2 deflator"

* Wave 3: index values for interview months 4 to 12; only the month is matched.
local pidx_w3 97.2 97.2 97.5 97.8 98.0 98.9 99.5 99.8 100.0

gen w3_deflator=.
forvalues mon = 4/12 {
	local pos = `mon' - 3
	local idx : word `pos' of `pidx_w3'
	replace w3_deflator = base/`idx' if w3_h_intrv_m==`mon'
}
* Outcome code 1 with no recorded interview month: use 98.0 (same as month 8).
replace w3_deflator = base/98.0 if w3_h_intrv_m==. & w3_h_outcome==1

label variable w3_deflator "Wave 3 deflator"

* Wave 4: the index table covers October 2014 through August 2015, matched on
* interview month and year.
gen w4_deflator=.

* 2014, months 10 to 12
local pidx_2014 111.2 111.2 111.0
forvalues mon = 10/12 {
	local pos = `mon' - 9
	local idx : word `pos' of `pidx_2014'
	replace w4_deflator = base/`idx' if w4_h_intrv_m==`mon' & w4_h_intrv_y==2014
}

* 2015, months 1 to 8
local pidx_2015 110.8 111.5 113.1 114.1 114.4 114.9 116.1 116.1
forvalues mon = 1/8 {
	local idx : word `mon' of `pidx_2015'
	replace w4_deflator = base/`idx' if w4_h_intrv_m==`mon' & w4_h_intrv_y==2015
}

* Outcome code 1 with no recorded interview month: use 111.2.
replace w4_deflator = base/111.2 if w4_h_intrv_m==. & w4_h_outcome==1

label variable w4_deflator "Wave 4 deflator"

* Wave 5: index values for months 1 to 12 of 2017. Month 4 equals the
* reference level held in scalar base, so its deflator is 1.
local pidx_2017 125.4 126.8 127.6 127.7 128.1 128.3 128.7 128.8 129.4 129.8 129.9 130.6

gen w5_deflator=.
forvalues mon = 1/12 {
	local idx : word `mon' of `pidx_2017'
	replace w5_deflator = base/`idx' if w5_h_intrv_m==`mon' & w5_h_intrv_y==2017
}

* Outcome code 1 with no recorded interview month: use 127.7.
replace w5_deflator = base/127.7 if w5_h_intrv_m==. & w5_h_outcome==1

label variable w5_deflator "Wave 5 deflator"

* Panel participation: one 0/1 flag per wave, equal to 1 when the wave's
* household outcome code is 1. A missing outcome code yields 0, not missing.
forvalues wv = 1/5 {
	generate wave_`wv' = (w`wv'_hh_outcome == 1)
}

* Move wave_died out of the wave_ namespace so the wildcard below only picks
* up the participation flags.
rename wave_died w_died

* Number of waves with outcome code 1 per record, then the largest such count
* among all records sharing a wave 1 household id.
egen wave_total = rowtotal(wave_*)
bysort w1_hhid: egen hh_wave_total = max(wave_total)
tabulate hh_wave_total

*Household composition counts per wave: children, members aged 15-65,
*members older than 65, and women aged 60 or more.
*
*Two fixes relative to the earlier version:
*  1. Age is read from w#_best_age_yrs directly. The derived w#_age variable is
*     only generated further down in this file, so referring to it here either
*     fails or silently picks up some other variable through abbreviation.
*  2. Stata treats numeric missing as larger than any number, so a bare
*     "age>65" or "age>=60" test is true for missing ages. The tests below
*     exclude missing ages explicitly.
forvalues i=1/5 {
	* Number of household members with a non-missing w#_c_resrel
	* (presumably a child-questionnaire item - confirm).
	bysort w`i'_hhid: egen w`i'_children = count(w`i'_c_resrel)

	* Members aged 15 to 65 inclusive; inrange() is false for missing age.
	gen w`i'_1565age = .
	replace w`i'_1565age = 1 if inrange(w`i'_best_age_yrs, 15, 65)
	bysort w`i'_hhid: egen w`i'_waa = count(w`i'_1565age)

	* Members strictly older than 65.
	gen w`i'_65p = .
	replace w`i'_65p = 1 if w`i'_best_age_yrs>65 & !missing(w`i'_best_age_yrs)
	bysort w`i'_hhid: egen w`i'_eld = count(w`i'_65p)

	* Members aged 60 or more with gender code 2.
	gen w`i'_w60p = .
	replace w`i'_w60p = 1 if w`i'_best_age_yrs>=60 & !missing(w`i'_best_age_yrs) & w`i'_best_gen==2
	bysort w`i'_hhid: egen w`i'_wpension = count(w`i'_w60p)

}

* Psychological well-being items, by wave: w#_em1 to w#_em10 equal 1 when the
* corresponding item has response code 1, 0 for any other non-missing
* response, and missing when the item is missing.
forvalues i=1/5 {
	local k = 0
	foreach item in emobth emomnd emodep emoeff emohope emofear emoslp emohap emolone emogo {
		local ++k
		gen w`i'_em`k' = (w`i'_a_`item'==1) if !missing(w`i'_a_`item')
	}
}

* CES-D scale, by wave.
* Each of the ten items is mapped to a 0-3 score. Eight items use
* (response - 1). The items emohope and emohap are reverse-scored, so
* responses 1, 2, 3, 4 become 3, 2, 1, 0. Any resulting negative score is set
* to missing. The scale is the plain sum of the ten scores, so it is missing
* whenever any single item is missing.
forvalues i=1/5 {
	local k = 0
	foreach item in emobth emomnd emodep emoeff emohope emofear emoslp emohap emolone emogo {
		local ++k
		if inlist("`item'", "emohope", "emohap") {
			* reverse-scored item; responses outside 1-4 stay missing
			gen w`i'_emo`k' = 4 - w`i'_a_`item' if inlist(w`i'_a_`item', 1, 2, 3, 4)
		}
		else {
			gen w`i'_emo`k' = w`i'_a_`item' - 1 if !missing(w`i'_a_`item')
		}
		replace w`i'_emo`k' = . if w`i'_emo`k'<0
	}

	gen w`i'_scale = w`i'_emo1 + w`i'_emo2 + w`i'_emo3 + w`i'_emo4 + w`i'_emo5 ///
		+ w`i'_emo6 + w`i'_emo7 + w`i'_emo8 + w`i'_emo9 + w`i'_emo10
}

* Depression indicator per wave: 1 when the CES-D scale is 10 or more,
* 0 when it is below 10, missing when the scale is missing.
forvalues i=1/5 {
	gen w`i'_depressed = (w`i'_scale>=10) if w`i'_scale!=.
}

****Basic Demographics
* w#_age copies w#_best_age_yrs unchanged.
* w#_male is 1 for gender code 1 and 0 for any other non-missing code.
* Fix: a missing gender now gives a missing dummy instead of 0, matching the
* guarded dummies used elsewhere in this file.
forvalues i=1/5{
gen w`i'_age = w`i'_best_age_yrs
gen w`i'_male = w`i'_best_gen==1 if !missing(w`i'_best_gen)
}

*Income variables
/*
*variables
Household Size: w1_hhsizer
HH monthly income (full imputation): w1_hhincome
HH monthly expenditure: w1_expf

*/

* Convert household income and the two expenditure aggregates to constant
* prices, then build per-capita versions and their squares.
* The source variables are overwritten in place, so running this section twice
* on the same data would apply the deflator twice.
forvalues i=1/5 {
	foreach agg in hhincome expf expnf {
		replace w`i'_`agg' = w`i'_`agg'*w`i'_deflator
		gen w`i'_`agg'_per = w`i'_`agg'/w`i'_hhsizer
	}

	foreach agg in hhincome expf expnf {
		gen w`i'_`agg'_per_sq = w`i'_`agg'_per^2
	}
}


** Labor supply variables
* Waves 1-2 use w#_empl_stat_inclprox: codes 1 and 2 count as active, and
* among those code 2 counts as employed.
* Waves 3-5 use w#_empl_stat: codes 1, 2 and 3 count as active, and among
* those code 3 counts as employed.
* w#_active is defined for every non-missing status. w#_employed is defined
* only for active individuals and is missing otherwise.
forvalues i=1/2 {
	gen w`i'_active   = inlist(w`i'_empl_stat_inclprox, 1, 2) if w`i'_empl_stat_inclprox!=.
	gen w`i'_employed = (w`i'_empl_stat_inclprox==2) if inlist(w`i'_empl_stat_inclprox, 1, 2)
}
forvalues i=3/5 {
	gen w`i'_active   = inlist(w`i'_empl_stat, 1, 2, 3) if w`i'_empl_stat!=.
	gen w`i'_employed = (w`i'_empl_stat==3) if inlist(w`i'_empl_stat, 1, 2, 3)
}

*Hours worked: sum of the six hours components in each wave.
*
*Fix: negative values are treated as invalid throughout this file, but a plain
*rowtotal adds them to valid hours (for example 40 plus -9 gives 31). Negative
*components are now blanked before summing. As before, rowtotal counts missing
*components as zero, and a record with no positive hours and at least one
*negative code ends up missing.
forvalues i=1/5{
	local hrsvars w`i'_a_em1hrs w`i'_a_em2hrs w`i'_a_emshrs w`i'_a_emchrs w`i'_a_emphrs w`i'_a_emhhrs

	* Raw total including negative codes; used only to spot records whose
	* components contain negative codes and no positive hours.
	tempvar rawtot
	egen `rawtot' = rowtotal(`hrsvars')

	* Copies of the components with negative and missing values left out.
	local valid
	foreach v of local hrsvars {
		tempvar h
		gen `h' = `v' if `v'>=0 & !missing(`v')
		local valid `valid' `h'
	}

	egen w`i'_hours = rowtotal(`valid')
	replace w`i'_hours = . if w`i'_hours==0 & `rawtot'<0

	drop `rawtot' `valid'
}

* Total individual income per wave: row sum of the 22 income components
* (missing components count as zero), converted to constant prices with the
* wave deflator.
local inc_parts fwag swag cwag cheq prof extr bonu othe help indi ///
	rnt retr brid loan sale spen ppen uif comp dis chld fost

forvalues i=1/5 {
	local srcvars
	foreach part of local inc_parts {
		local srcvars `srcvars' w`i'_`part'
	}
	egen w`i'_totindinc = rowtotal(`srcvars')
	replace w`i'_totindinc = w`i'_totindinc*w`i'_deflator
}

*Household Wealth Index
*For each wave, build 0/1 ownership dummies from the household asset items
*(1 when the item equals 1, 0 for other non-negative values, missing when the
*item is missing or negative), run a principal-component factor analysis on
*them, and store the predicted score as w#_wealthindex.
*
*Fix: the cell-phone dummy was conditioned on w#_a_owncel while being computed
*from w#_h_owncel. It is now conditioned on w#_h_owncel, like every other
*asset, so it is defined wherever the household item is answered.
*
*NOTE(review): wealth_telev and wealth_tel are both built from w#_h_owntel and
*are therefore identical, so that item enters the factor analysis twice.
*Confirm which source variable wealth_tel was meant to use. Left unchanged.

set more off
forvalues i=1/5 {
	* Assets created before the stove dummy (creation order is preserved).
	local names radio vehicle farm1 farm2 farm3 farm4 telev
	local items ownrad ownvehpri ownplg owntra ownwhl ownmll owntel
	local n : word count `names'
	forvalues k = 1/`n' {
		local nm : word `k' of `names'
		local it : word `k' of `items'
		gen w`i'_wealth_`nm' = w`i'_h_`it'==1 if !missing(w`i'_h_`it')
		replace w`i'_wealth_`nm' = . if w`i'_h_`it'<0
	}

	* Stove: electric or gas. Missingness follows the electric-stove item.
	gen w`i'_wealth_stove = w`i'_h_ownelestv==1 if !missing(w`i'_h_ownelestv)
	replace w`i'_wealth_stove = 1 if w`i'_h_owngasstv==1
	replace w`i'_wealth_stove = . if w`i'_h_ownelestv<0

	* Assets created after the stove dummy.
	local names cell tel compu micro frdg toil
	local items owncel owntel owncom ownmic ownfrg toi
	local n : word count `names'
	forvalues k = 1/`n' {
		local nm : word `k' of `names'
		local it : word `k' of `items'
		gen w`i'_wealth_`nm' = w`i'_h_`it'==1 if !missing(w`i'_h_`it')
		replace w`i'_wealth_`nm' = . if w`i'_h_`it'<0
	}

	* The wildcard picks up exactly the dummies created above for this wave,
	* because the index and the quantile variables do not exist yet.
	factor w`i'_wealth_*, pcf
	predict w`i'_wealthindex
}

* Quintile and decile groups of the wealth index, computed within each wave.
forvalues i=1/5 {
	foreach spec in "quintile 5" "decile 10" {
		local nm : word 1 of `spec'
		local q  : word 2 of `spec'
		xtile w`i'_wealth_`nm' = w`i'_wealthindex, nq(`q')
	}
}

* Education level dummies per wave, derived from the w#_best_edu code.
* Each dummy is missing when the code is missing; any non-missing code
* outside a band gives 0 for that band.
*   25 = no schooling, 0 = grade 0, 1-5 primary, 6-8 middle,
*   9-12 secondary, 13-19 diploma, 20-23 tertiary
forvalues i=1/5 {
	local edu w`i'_best_edu
	gen w`i'_noschool  = (`edu'==25)           if !missing(`edu')
	gen w`i'_gr0       = (`edu'==0)            if !missing(`edu')
	gen w`i'_primary   = inrange(`edu', 1, 5)   if !missing(`edu')
	gen w`i'_middle    = inrange(`edu', 6, 8)   if !missing(`edu')
	gen w`i'_secondary = inrange(`edu', 9, 12)  if !missing(`edu')
	gen w`i'_diploma   = inrange(`edu', 13, 19) if !missing(`edu')
	gen w`i'_tertiary  = inrange(`edu', 20, 23) if !missing(`edu')
}

* Give the wave 4 and wave 5 marital-status variables the same name pattern as
* the other waves so they can be handled in one loop.
rename w4_best_marstt  w4_a_marstt
rename w5_a_mar w5_a_marstt

* Married dummy per wave: 1 for status codes 1 or 2, 0 for any other value,
* missing when the status is missing.
forvalues i=1/5 {
	gen w`i'_married = inlist(w`i'_a_marstt, 1, 2) if w`i'_a_marstt!=.
}


save data_sept2021, replace

* Dropping outliers.
* inrange() is false for a missing first argument, so each keep below also
* removes records where the trimmed variable is missing.

* Wave 3 to wave 4 changes.
gen dw4_hhincome_per = w4_hhincome_per - w3_hhincome_per
gen dw4_totindinc = w4_totindinc - w3_totindinc
gen dw4_scale = w4_scale - w3_scale

* Both sets of income-change cutoffs are computed on the full sample before
* either trim is applied.
centile dw4_hhincome_per, centile(2.5 97.5)
local hh_lo=r(c_1)
local hh_hi=r(c_2)
centile dw4_totindinc, centile(2.5 97.5)
local ind_lo=r(c_1)
local ind_hi=r(c_2)

keep if inrange(dw4_hhincome_per, `hh_lo', `hh_hi')
keep if inrange(dw4_totindinc, `ind_lo', `ind_hi')

* Change in the CES-D scale: cutoffs computed on the already-trimmed sample.
centile dw4_scale, centile(0.1 99.9)
local sc_lo=r(c_1)
local sc_hi=r(c_2)
keep if inrange(dw4_scale, `sc_lo', `sc_hi')

* Per-capita household income levels, trimmed one wave at a time in the order
* wave 4, wave 3, wave 2. Each cutoff is computed after the previous trim.
foreach wv in 4 3 2 {
	centile w`wv'_hhincome_per, centile(0.25 99.75)
	local lev_lo=r(c_1)
	local lev_hi=r(c_2)
	keep if inrange(w`wv'_hhincome_per, `lev_lo', `lev_hi')
}

**Low hanging fruit variable creation**

* Male dummy. The wave 1 adult answer is used first.
* Fix: anyone without a usable wave 1 adult answer was coded 0. For those
* records the dummy now also turns 1 when any wave's w#_best_gen equals 1,
* the same coding used for w#_male in this file. The variable stays 0/1.
gen male = 0
replace male =1 if w1_a_gen==1
forvalues i=1/5 {
	replace male = 1 if !inlist(w1_a_gen, 1, 2) & w`i'_best_gen==1
}

* African dummy from the wave 1 adult population-group answer (code 1).
* NOTE(review): records with a missing wave 1 answer are coded 0 here.
gen african = 0
replace african = 1 if w1_a_popgrp==1

* Disability dummy for waves 1-4: 1 when w#_a_hlser equals 1, otherwise 0
* (including when the item is missing). No wave 5 variable is created.
forvalues i=1/4{
gen w`i'_disability = 0
replace w`i'_disability = 1 if w`i'_a_hlser==1
}

/*Dropping noise mental health variables: These are noisy because it suggests 
people are just going down the list and tapping the leftmost button. Results
are robust to keeping these in.*/
/*
forvalues i=1/5{
drop if w`i'_scale==6 & w`i'_emo5==3 & w`i'_emo8==3
}
*/

save data_sept2021, replace

* Align the wave 3-5 household respondent variables with the h_pcode_pid stub.
rename w3_h_respondent w3_h_pcode_pid
rename w4_h_respondent w4_h_pcode_pid
rename w5_h_respondent w5_h_pcode_pid

* Changing from wide to long.
* NOTE(review): the keep list has no pattern matching the h_pcode_pid or poor
* variables, yet both appear as reshape stubs below. Confirm whether they
* should be kept or removed from the stub list.
local keepvars cluster pid male african ///
	w*_questionnaire w*_hhid w*_hhsizer w*_pweight w*_stayer w*_h_intrv_y ///
	w*_prov2011 w*_geo2011 w*_dc2011 w*_mdbdc2011 ///
	w*_a_emo* w*_scale w*_a_marstt w*_age w*_disability ///
	w*_noschool w*_primary w*_middle w*_secondary w*_diploma w*_tertiary ///
	w*_employed w*_active w*_hours ///
	w*_totindinc w*_hhincome_per w*_expf_per ///
	w*_wealthindex w*_wealthindex* w*_wealth_decile w*_wealth_quintile ///
	w*_children w*_1565age w*_65p
keep `keepvars'

* Stub list, in the original order.
local stubs w@_questionnaire w@_hhid w@_totindinc w@_scale w@_hhsizer w@_expf_per ///
	w@_noschool w@_primary w@_middle w@_secondary w@_diploma w@_tertiary ///
	w@_employed w@_wealthindex w@_active w@_age w@_hhincome_per ///
	w@_h_pcode_pid w@_prov2011 w@_geo2011 w@_dc2011 w@_mdbdc2011 ///
	w@_children w@_1565age w@_65p ///
	w@_wealth_decile w@_wealth_quintile ///
	w@_poor w@_pweight ///
	w@_stayer w@_a_marstt ///
	w@_a_emobth w@_a_emomnd w@_a_emodep w@_a_emoeff w@_a_emohope ///
	w@_a_emofear w@_a_emoslp w@_a_emohap w@_a_emolone w@_a_emogo ///
	w@_h_intrv_y w@_disability w@_hours
reshape long `stubs', i(pid) j(wave)

* Person-wave rows without a household id are removed.
drop if w_hhid==.

* Declare the panel structure; the d. operators used later depend on it.
xtset pid wave
sort pid wave

*Creating new variables (long format; relies on the xtset pid wave declaration
*made before this point)

* Married dummy.
* Fix: a missing marital status now gives a missing dummy instead of 0.
* NOTE(review): this uses status code 1 only, while the wide-format
* w#_married dummies earlier in the file use codes 1 or 2. Confirm which
* definition is intended.
gen w_married = w_a_marstt==1 if !missing(w_a_marstt)

*Depression risk indicators at CES-D cutoffs 10, 11 and 12
gen depressed10 = (w_scale>=10) if !missing(w_scale)
gen depressed11 = (w_scale>=11) if !missing(w_scale)
gen depressed12 = (w_scale>=12) if !missing(w_scale)
* 1 for every wave of a person who reaches the cutoff of 10 in any wave
egen w_depressionprone = max(depressed10), by(pid)

* Rescale the money variables to hundreds of currency units
replace w_totindinc = w_totindinc/100
replace w_hhincome_per = w_hhincome_per/100
replace w_expf_per = w_expf_per/100

* First differences between consecutive waves
gen dw_scale = d.w_scale
gen dw_hhincome_per = d.w_hhincome_per
gen dw_totindinc = d.w_totindinc

* Squares of the rescaled money variables
gen w_hhincome_per_sq = w_hhincome_per^2
gen w_totindinc_sq = w_totindinc^2
gen w_expf_per_sq= w_expf_per^2

* Logs. NOTE(review): household income and expenditure use log(x+1), while
* individual income uses log(x), which is missing when income is zero.
* Confirm the difference is intended.
gen logw_hhincome_per = log(w_hhincome_per+1)
gen logw_expf_per = log(w_expf_per+1)
gen logw_totindinc = log(w_totindinc)


* The saved file contains all five waves. The wave 5 rows are dropped only
* from the data left in memory afterwards.
save data_with5, replace
capture drop if wave==5
